{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from torchmoji.sentence_tokenizer import SentenceTokenizer\n",
    "from torchmoji.model_def import torchmoji_feature_encoding\n",
    "from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "source_data_dir = Path(r\"F:\\Datasets\\Fairness\\TweetAAE\\Processed\\sentiment_race\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = {}\n",
    "for name in ['pos_pos', 'pos_neg', 'neg_pos', 'neg_neg']:\n",
    "    with open(source_data_dir / (name + \"_text\"), encoding=\"latin-1\") as f:\n",
    "        _texts = f.readlines()\n",
    "        _texts = [_t.strip() for _t in _texts]\n",
    "        texts[name] = _texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokenizing using dictionary from d:\\Project\\torchMoji/model/vocabulary.json\n"
     ]
    }
   ],
   "source": [
    "print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))\n",
    "with open(VOCAB_PATH, 'r') as f:\n",
    "    vocabulary = json.load(f)\n",
    "st = SentenceTokenizer(vocabulary, 32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 73094/73094 [00:12<00:00, 5693.15it/s]\n",
      "100%|██████████| 100007/100007 [00:18<00:00, 5293.93it/s]\n",
      "100%|██████████| 44059/44059 [00:07<00:00, 5879.52it/s]\n",
      "100%|██████████| 100001/100001 [00:17<00:00, 5664.65it/s]\n"
     ]
    }
   ],
   "source": [
    "tokenized_texts = {}\n",
    "for name in texts.keys():\n",
    "    tokenized = []\n",
    "    for _text in tqdm(texts[name]):\n",
    "        try:\n",
    "            _tokenized, _, _ = st.tokenize_sentences([_text])\n",
    "        except:\n",
    "            _tokenized = [None]\n",
    "        tokenized.append(_tokenized[0])\n",
    "    tokenized_texts[name] = tokenized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dfs = {}\n",
    "for name in texts.keys():\n",
    "    data_dfs[name] = pd.DataFrame({\n",
    "        \"text\":texts[name],\n",
    "        \"tokenized_texts\":tokenized_texts[name],\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>tokenized_texts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Yayyyy the macs fixed</td>\n",
       "      <td>[1, 10, 12646, 2259, 0, 0, 0, 0, 0, 0, 0, 0, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ik nothing about basketball sooooo imma keep q...</td>\n",
       "      <td>[3701, 190, 52, 1998, 1, 2831, 276, 1531, 34, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Monsters and coffee at 9 o'clock!! Oh ya , it'...</td>\n",
       "      <td>[5357, 12, 511, 56, 4, 5042, 50, 501, 1964, 14...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I can't wait until Monday &amp; gt ; &amp; gt ; &amp; gt ;...</td>\n",
       "      <td>[18, 164, 262, 393, 1875, 193, 2952, 385, 193,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Surprise mudding trip</td>\n",
       "      <td>[1146, 22084, 978, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100002</th>\n",
       "      <td>_TWITTER-ENTITY_ Hahaha lolololol it means dir...</td>\n",
       "      <td>[1, 1, 1, 1561, 11295, 26, 689, 3111, 1156, 1,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100003</th>\n",
       "      <td>_TWITTER-ENTITY_ haha I eat cats dogs are nast...</td>\n",
       "      <td>[1, 1, 1, 924, 18, 330, 2050, 900, 24, 1062, 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100004</th>\n",
       "      <td>_TWITTER-ENTITY_ the facial expressions are th...</td>\n",
       "      <td>[1, 1, 1, 10, 4884, 11677, 24, 10, 204, 805, 4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100005</th>\n",
       "      <td>Well that was extremely awkward</td>\n",
       "      <td>[100, 23, 25, 662, 2064, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100006</th>\n",
       "      <td>_TWITTER-ENTITY_ damn I'll find them twins bro</td>\n",
       "      <td>[1, 1, 1, 475, 396, 271, 94, 3717, 1395, 0, 0,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100007 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     text  \\\n",
       "0                                   Yayyyy the macs fixed   \n",
       "1       Ik nothing about basketball sooooo imma keep q...   \n",
       "2       Monsters and coffee at 9 o'clock!! Oh ya , it'...   \n",
       "3       I can't wait until Monday & gt ; & gt ; & gt ;...   \n",
       "4                                   Surprise mudding trip   \n",
       "...                                                   ...   \n",
       "100002  _TWITTER-ENTITY_ Hahaha lolololol it means dir...   \n",
       "100003  _TWITTER-ENTITY_ haha I eat cats dogs are nast...   \n",
       "100004  _TWITTER-ENTITY_ the facial expressions are th...   \n",
       "100005                    Well that was extremely awkward   \n",
       "100006     _TWITTER-ENTITY_ damn I'll find them twins bro   \n",
       "\n",
       "                                          tokenized_texts  \n",
       "0       [1, 10, 12646, 2259, 0, 0, 0, 0, 0, 0, 0, 0, 0...  \n",
       "1       [3701, 190, 52, 1998, 1, 2831, 276, 1531, 34, ...  \n",
       "2       [5357, 12, 511, 56, 4, 5042, 50, 501, 1964, 14...  \n",
       "3       [18, 164, 262, 393, 1875, 193, 2952, 385, 193,...  \n",
       "4       [1146, 22084, 978, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "...                                                   ...  \n",
       "100002  [1, 1, 1, 1561, 11295, 26, 689, 3111, 1156, 1,...  \n",
       "100003  [1, 1, 1, 924, 18, 330, 2050, 900, 24, 1062, 1...  \n",
       "100004  [1, 1, 1, 10, 4884, 11677, 24, 10, 204, 805, 4...  \n",
       "100005  [100, 23, 25, 662, 2064, 0, 0, 0, 0, 0, 0, 0, ...  \n",
       "100006  [1, 1, 1, 475, 396, 271, 94, 3717, 1395, 0, 0,...  \n",
       "\n",
       "[100007 rows x 2 columns]"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dfs[\"pos_neg\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>tokenized_texts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>37519</th>\n",
       "      <td>_TWITTER-ENTITY_ tell me it's not funny !</td>\n",
       "      <td>[1, 1, 1, 307, 76, 68, 20, 288, 19, 0, 0, 0, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99003</th>\n",
       "      <td>â _TWITTER-ENTITY_ : wtf do you mean you don...</td>\n",
       "      <td>[1, 1, 1, 1, 1, 1, 148, 1047, 126, 13, 376, 13...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70610</th>\n",
       "      <td>Everyone has their opinion so SHUT THE FUCK UP...</td>\n",
       "      <td>[314, 81, 74, 735, 60, 505, 10, 133, 112, 386,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72997</th>\n",
       "      <td>I actually put jeans on today and I already wa...</td>\n",
       "      <td>[18, 286, 250, 2883, 44, 468, 12, 18, 386, 113...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71307</th>\n",
       "      <td>_TWITTER-ENTITY_ LMFAO . I actually sing good ...</td>\n",
       "      <td>[1, 1, 1, 1916, 11, 18, 286, 1613, 32, 12, 18,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38855</th>\n",
       "      <td>What do ducks smoke ? Quack .</td>\n",
       "      <td>[39, 126, 10086, 1273, 61, 19412, 11, 0, 0, 0,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69142</th>\n",
       "      <td>_TWITTER-ENTITY_ we went into protective mode ...</td>\n",
       "      <td>[1, 1, 1, 54, 239, 185, 8056, 3638, 1, 40, 362...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22623</th>\n",
       "      <td>Spending all day tomorrow studying for exams</td>\n",
       "      <td>[1263, 40, 196, 1576, 2895, 16, 4035, 0, 0, 0,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83328</th>\n",
       "      <td>_TWITTER-ENTITY_ hahah , you flipped out when ...</td>\n",
       "      <td>[1, 1, 1, 2658, 14, 13, 8578, 53, 63, 13, 627,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76458</th>\n",
       "      <td>Gah man you shouldn't have run your mouth</td>\n",
       "      <td>[7880, 183, 13, 1126, 33, 466, 27, 583, 0, 0, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>44000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text  \\\n",
       "37519          _TWITTER-ENTITY_ tell me it's not funny !   \n",
       "99003  â _TWITTER-ENTITY_ : wtf do you mean you don...   \n",
       "70610  Everyone has their opinion so SHUT THE FUCK UP...   \n",
       "72997  I actually put jeans on today and I already wa...   \n",
       "71307  _TWITTER-ENTITY_ LMFAO . I actually sing good ...   \n",
       "...                                                  ...   \n",
       "38855                      What do ducks smoke ? Quack .   \n",
       "69142  _TWITTER-ENTITY_ we went into protective mode ...   \n",
       "22623       Spending all day tomorrow studying for exams   \n",
       "83328  _TWITTER-ENTITY_ hahah , you flipped out when ...   \n",
       "76458          Gah man you shouldn't have run your mouth   \n",
       "\n",
       "                                         tokenized_texts  \n",
       "37519  [1, 1, 1, 307, 76, 68, 20, 288, 19, 0, 0, 0, 0...  \n",
       "99003  [1, 1, 1, 1, 1, 1, 148, 1047, 126, 13, 376, 13...  \n",
       "70610  [314, 81, 74, 735, 60, 505, 10, 133, 112, 386,...  \n",
       "72997  [18, 286, 250, 2883, 44, 468, 12, 18, 386, 113...  \n",
       "71307  [1, 1, 1, 1916, 11, 18, 286, 1613, 32, 12, 18,...  \n",
       "...                                                  ...  \n",
       "38855  [39, 126, 10086, 1273, 61, 19412, 11, 0, 0, 0,...  \n",
       "69142  [1, 1, 1, 54, 239, 185, 8056, 3638, 1, 40, 362...  \n",
       "22623  [1263, 40, 196, 1576, 2895, 16, 4035, 0, 0, 0,...  \n",
       "83328  [1, 1, 1, 2658, 14, 13, 8578, 53, 63, 13, 627,...  \n",
       "76458  [7880, 183, 13, 1126, 33, 466, 27, 583, 0, 0, ...  \n",
       "\n",
       "[44000 rows x 2 columns]"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = data_dfs[\"pos_neg\"]\n",
    "df = df[df[\"tokenized_texts\"].notnull()]\n",
    "df = df.sample(n=44000, random_state=2020)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name in data_dfs.keys():\n",
    "    df = data_dfs[name]\n",
    "    df = df[df[\"tokenized_texts\"].notnull()]\n",
    "    df = df.sample(n=44000, random_state=2020)\n",
    "    data_dfs[name] = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading model from d:\\Project\\torchMoji/model/pytorch_model.bin.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Project\\torchMoji\\torchmoji\\model_def.py:159: UserWarning: nn.init.uniform is now deprecated in favor of nn.init.uniform_.\n",
      "  nn.init.uniform(self.embed.weight.data, a=-0.5, b=0.5)\n",
      "d:\\Project\\torchMoji\\torchmoji\\model_def.py:161: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.\n",
      "  nn.init.xavier_uniform(t)\n",
      "d:\\Project\\torchMoji\\torchmoji\\model_def.py:163: UserWarning: nn.init.orthogonal is now deprecated in favor of nn.init.orthogonal_.\n",
      "  nn.init.orthogonal(t)\n",
      "d:\\Project\\torchMoji\\torchmoji\\model_def.py:165: UserWarning: nn.init.constant is now deprecated in favor of nn.init.constant_.\n",
      "  nn.init.constant(t, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading weights for embed.weight\n",
      "Loading weights for lstm_0.weight_ih_l0\n",
      "Loading weights for lstm_0.weight_hh_l0\n",
      "Loading weights for lstm_0.bias_ih_l0\n",
      "Loading weights for lstm_0.bias_hh_l0\n",
      "Loading weights for lstm_0.weight_ih_l0_reverse\n",
      "Loading weights for lstm_0.weight_hh_l0_reverse\n",
      "Loading weights for lstm_0.bias_ih_l0_reverse\n",
      "Loading weights for lstm_0.bias_hh_l0_reverse\n",
      "Loading weights for lstm_1.weight_ih_l0\n",
      "Loading weights for lstm_1.weight_hh_l0\n",
      "Loading weights for lstm_1.bias_ih_l0\n",
      "Loading weights for lstm_1.bias_hh_l0\n",
      "Loading weights for lstm_1.weight_ih_l0_reverse\n",
      "Loading weights for lstm_1.weight_hh_l0_reverse\n",
      "Loading weights for lstm_1.bias_ih_l0_reverse\n",
      "Loading weights for lstm_1.bias_hh_l0_reverse\n",
      "Loading weights for attention_layer.attention_vector\n",
      "Ignoring weights for output_layer.0.weight\n",
      "Ignoring weights for output_layer.0.bias\n",
      "TorchMoji(\n",
      "  (embed): Embedding(50000, 256)\n",
      "  (embed_dropout): Dropout2d(p=0, inplace=False)\n",
      "  (lstm_0): LSTMHardSigmoid(256, 512, batch_first=True, bidirectional=True)\n",
      "  (lstm_1): LSTMHardSigmoid(1024, 512, batch_first=True, bidirectional=True)\n",
      "  (attention_layer): Attention(2304, return attention=False)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "print('Loading model from {}.'.format(PRETRAINED_PATH))\n",
    "model = torchmoji_feature_encoding(PRETRAINED_PATH)\n",
    "print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 128,  943,  248, ...,    0,    0,    0],\n",
       "       [   1,    1,    1, ...,    0,    0,    0],\n",
       "       [   1,    1,    1, ...,    0,    0,    0],\n",
       "       ...,\n",
       "       [ 907,   17,   67, ...,    0,    0,    0],\n",
       "       [ 249,   10, 2895, ...,    0,    0,    0],\n",
       "       [   1,    1,    1, ...,    0,    0,    0]], dtype=uint16)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.stack(df[\"tokenized_texts\"].to_list(), axis=1).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "_encoding = model(np.stack(df[\"tokenized_texts\"].to_list(), axis=1).T[:200])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Encoding texts..\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\emano\\.conda\\envs\\py37\\lib\\site-packages\\torch\\nn\\functional.py:1795: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\n",
      "  warnings.warn(\"nn.functional.tanh is deprecated. Use torch.tanh instead.\")\n"
     ]
    }
   ],
   "source": [
    "print('Encoding texts..')\n",
    "\n",
    "for name in data_dfs.keys():\n",
    "    df = data_dfs[name]\n",
    "    _encoding = model(np.stack(df[\"tokenized_texts\"].to_list(), axis=1).T)\n",
    "    df[\"encoding\"] = list(_encoding)\n",
    "    data_dfs[name] = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>tokenized_texts</th>\n",
       "      <th>encoding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>36083</th>\n",
       "      <td>I'm sitting next to the most awkward couple on...</td>\n",
       "      <td>[128, 943, 248, 17, 10, 169, 2064, 659, 44, 10...</td>\n",
       "      <td>[0.08898257, 0.014492264, -0.0073325643, 0.0, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31787</th>\n",
       "      <td>... _TWITTER-ENTITY_ you snapped me a pic of y...</td>\n",
       "      <td>[1, 1, 1, 1, 13, 6438, 76, 15, 1788, 21, 13, 1...</td>\n",
       "      <td>[-0.047121312, -0.15688002, -0.0153493155, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33964</th>\n",
       "      <td>_TWITTER-ENTITY_ I had my head covered or hand...</td>\n",
       "      <td>[1, 1, 1, 18, 58, 41, 471, 1920, 127, 868, 118...</td>\n",
       "      <td>[0.0033988312, -0.0069653085, -0.030654537, 0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26097</th>\n",
       "      <td>I just need a charger for my ipod</td>\n",
       "      <td>[18, 42, 159, 15, 1898, 16, 41, 2024, 0, 0, 0,...</td>\n",
       "      <td>[0.000186576, 0.033296738, 0.0, 0.0, -0.003505...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89039</th>\n",
       "      <td>$18 to go to the 103rd floor &amp; amp ; stand in ...</td>\n",
       "      <td>[179, 4, 17, 157, 17, 10, 4, 1855, 1092, 193, ...</td>\n",
       "      <td>[0.0014130066, 0.004298318, -0.09110178, 0.0, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text  \\\n",
       "36083  I'm sitting next to the most awkward couple on...   \n",
       "31787  ... _TWITTER-ENTITY_ you snapped me a pic of y...   \n",
       "33964  _TWITTER-ENTITY_ I had my head covered or hand...   \n",
       "26097                  I just need a charger for my ipod   \n",
       "89039  $18 to go to the 103rd floor & amp ; stand in ...   \n",
       "\n",
       "                                         tokenized_texts  \\\n",
       "36083  [128, 943, 248, 17, 10, 169, 2064, 659, 44, 10...   \n",
       "31787  [1, 1, 1, 1, 13, 6438, 76, 15, 1788, 21, 13, 1...   \n",
       "33964  [1, 1, 1, 18, 58, 41, 471, 1920, 127, 868, 118...   \n",
       "26097  [18, 42, 159, 15, 1898, 16, 41, 2024, 0, 0, 0,...   \n",
       "89039  [179, 4, 17, 157, 17, 10, 4, 1855, 1092, 193, ...   \n",
       "\n",
       "                                                encoding  \n",
       "36083  [0.08898257, 0.014492264, -0.0073325643, 0.0, ...  \n",
       "31787  [-0.047121312, -0.15688002, -0.0153493155, -0....  \n",
       "33964  [0.0033988312, -0.0069653085, -0.030654537, 0....  \n",
       "26097  [0.000186576, 0.033296738, 0.0, 0.0, -0.003505...  \n",
       "89039  [0.0014130066, 0.004298318, -0.09110178, 0.0, ...  "
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dfs[name].iloc[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name in data_dfs.keys():\n",
    "    data_dfs[name].iloc[:40000].to_pickle(source_data_dir / (\"train_\"+name+\"_df.pkl\"))\n",
    "    data_dfs[name].iloc[40000:42000].to_pickle(source_data_dir / (\"dev_\"+name+\"_df.pkl\"))\n",
    "    data_dfs[name].iloc[42000:44000].to_pickle(source_data_dir / (\"test_\"+name+\"_df.pkl\"))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "086b6e2ac5931a4f7a51b812f0475290a145e7d51f3e22b53c52adf5c273fe30"
  },
  "kernelspec": {
   "display_name": "Python 3.7.12 ('py37')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
